Load the required libraries
library(dplyr)
library(ggplot2)
library(tidyverse)
library(plotly)
library(heatmaply)
library(showtext)
library(RColorBrewer)
library(reshape2)
Join the training and test sets
train <- read.csv("./train.csv")
test <- read.csv("./test.csv")
test_Sub <- read.csv("./submission.csv")
country_isos <- read.csv("./wikipedia-iso-country-codes.csv", stringsAsFactors = FALSE)
covid19 <- inner_join(test, test_Sub, by = "ForecastId") |>
rename(Id = ForecastId) |>
bind_rows(train) |>
arrange(Country.Region, Date) |>
select(Id, Country.Region, Date, ConfirmedCases)
head(covid19)
## Id Country.Region Date ConfirmedCases
## 1 1 Afghanistan 2020-01-22 0
## 2 2 Afghanistan 2020-01-23 0
## 3 3 Afghanistan 2020-01-24 0
## 4 4 Afghanistan 2020-01-25 0
## 5 5 Afghanistan 2020-01-26 0
## 6 6 Afghanistan 2020-01-27 0
Filter
Filter to countries that reached at least 50 confirmed cases
covid19 <- covid19 |> filter(ConfirmedCases > 50)
Count the number of days for each country
covid19_numdays <- covid19 |>
distinct(Country.Region, Date) |>
group_by(Country.Region) |>
summarize(num_days = n())
Filter to countries with at least 14 days of data
covid19_mindays <- covid19_numdays |>
filter(num_days >= 14)
Filter the original data frame to selected countries
covid19 <- covid19 |>
filter(Country.Region %in% covid19_mindays$Country.Region)
Count unique countries
num_countries <- covid19 |>
distinct(Country.Region) |>
nrow()
cat("Number of selected countries:", num_countries, "\n")
## Number of selected countries: 36
Print unique countries
unique_countries <- covid19 |>
distinct(Country.Region) |>
pull()
cat("Selected countries:", unique_countries, "\n")
## Selected countries: Australia Austria Bahrain Belgium China Denmark Egypt Finland France Germany Greece Iceland India Iran Iraq Israel Italy Japan Korea, South Kuwait Lebanon Malaysia Netherlands Norway Portugal Qatar San Marino Singapore Slovenia Spain Sweden Switzerland Thailand US United Arab Emirates United Kingdom
Compute growth over 14 days
Subset for rows where ‘Country/Region’ is ‘China’
covid19_china <- covid19[covid19$Country.Region == 'China', ]
head(covid19_china)
## Id Country.Region Date ConfirmedCases
## 105 5581 China 2020-01-22 444
## 106 5582 China 2020-01-23 444
## 107 4839 China 2020-01-24 53
## 108 5583 China 2020-01-24 549
## 109 4561 China 2020-01-25 57
## 110 4840 China 2020-01-25 78
Group by ‘Country.Region’ and ‘Date’, and calculate the sum of other
columns
covid19_collapse_province <- covid19 |>
group_by(Country.Region, Date) |>
summarise_all(sum) |>
ungroup()
Subset for rows where ‘Country.Region’ is ‘China’, and select first
few rows
covid19_collapse_province_china <- covid19_collapse_province |>
filter(Country.Region == 'China')
head(covid19_collapse_province_china)
## # A tibble: 6 × 4
## Country.Region Date Id ConfirmedCases
## <chr> <chr> <int> <dbl>
## 1 China 2020-01-22 5581 444
## 2 China 2020-01-23 5582 444
## 3 China 2020-01-24 10422 602
## 4 China 2020-01-25 22336 958
## 5 China 2020-01-26 36863 1545
## 6 China 2020-01-27 60981 2337
Summary of COVID-19 cases for each country
covid19 <- covid19_collapse_province |>
group_by(`Country.Region`) |> #groups the data frame by country, keeps the last 14 rows for each country
slice_tail(n = 14) |>
group_by(`Country.Region`) |> #keeps only the last row for each country
slice_tail(n = 1) |>
ungroup()
covid19
## # A tibble: 36 × 4
## Country.Region Date Id ConfirmedCases
## <chr> <chr> <int> <dbl>
## 1 Australia 2020-03-24 6453 1971
## 2 Austria 2020-03-24 1644 5283
## 3 Bahrain 2020-03-24 1830 392
## 4 Belgium 2020-03-24 2202 4269
## 5 China 2020-03-24 150624 79199
## 6 Denmark 2020-03-24 16959 1713
## 7 Egypt 2020-03-24 8898 402
## 8 Finland 2020-03-24 9363 792
## 9 France 2020-03-24 19656 156
## 10 Germany 2020-03-24 10572 32986
## # ℹ 26 more rows
Country Abbreviations
Rename the columns
names(country_isos)[names(country_isos) == "English.short.name.lower.case"] <- "Country.Region"
names(country_isos)[names(country_isos) == "Alpha.2.code"] <- "country_abbr"
Subset the columns
country_isos <- subset(country_isos, select = c("Country.Region", "country_abbr"))
head(country_isos)
## Country.Region country_abbr
## 1 Afghanistan AF
## 2 Ã…land Islands AX
## 3 Albania AL
## 4 Algeria DZ
## 5 American Samoa AS
## 6 Andorra AD
Merge the data frames
covid19 <- merge(covid19, country_isos, by = "Country.Region") |> na.omit()
Display the first few rows
head(covid19)
## Country.Region Date Id ConfirmedCases country_abbr
## 1 Australia 2020-03-24 6453 1971 AU
## 2 Austria 2020-03-24 1644 5283 AT
## 3 Bahrain 2020-03-24 1830 392 BH
## 4 Belgium 2020-03-24 2202 4269 BE
## 5 China 2020-03-24 150624 79199 CN
## 6 Denmark 2020-03-24 16959 1713 DK
Big Five Personality Data
big5 <- read.csv("./data-final.csv", sep = "\t")
positively_keyed <- c("EXT1", "EXT3", "EXT5", "EXT7", "EXT9",
"EST1", "EST3", "EST5", "EST6", "EST7", "EST8", "EST9", "EST10",
"AGR2", "AGR4", "AGR6", "AGR8", "AGR9", "AGR10",
"CSN1", "CSN3", "CSN5", "CSN7", "CSN9", "CSN10",
"OPN1", "OPN3", "OPN5", "OPN7", "OPN8", "OPN9", "OPN10")
negatively_keyed <- c("EXT2", "EXT4", "EXT6", "EXT8", "EXT10",
"EST2", "EST4",
"AGR1", "AGR3", "AGR5", "AGR7",
"CSN2", "CSN4", "CSN6", "CSN8",
"OPN2", "OPN4", "OPN6")
big5[, big5$negatively_keyed] <- 6 - big5[, big5$negatively_keyed]
Country-Level Big 5 Aggregates
big5_new <- big5 |>
count(country, name = "counts") |>
filter(counts > 1000) |>
arrange(desc(counts)) |>
inner_join(big5 |> select(country, positively_keyed, negatively_keyed), by = "country") |>
na.omit()
cat("Number of countries:", n_distinct(big5_new$country), "\n")
## Number of countries: 58
cat("List of countries:\n", unique(big5_new$country), "\n")
## List of countries:
## US GB CA AU PH IN DE NONE NZ NO MY MX SE NL SG ID BR FR DK IE IT ES PL FI RO BE ZA CO HK PK RU AR CH AE TR PT GR VN HR AT CL RS CZ TH JP PE KR HU IL KE CN BG VE EC LT SA EG EE
Factor aggregation
big5_country_averages <- big5_new |>
mutate(across(EXT1:OPN10, as.numeric)) |>
group_by(country) |>
summarize(EXT = mean(EXT1:EXT10, na.rm = TRUE),
EST = mean(EST1:EST10, na.rm = TRUE),
AGR = mean(AGR1:AGR10, na.rm = TRUE),
CSN = mean(CSN1:CSN10, na.rm = TRUE),
OPN = mean(OPN1:OPN10, na.rm = TRUE)) |>
ungroup()
big5_country_averages
## # A tibble: 58 × 6
## country EXT EST AGR CSN OPN
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AE 3.5 2.5 3.5 5 4
## 2 AR 3 5 4 4 4
## 3 AT 3.5 3 5 2.5 4
## 4 AU 3 4.5 3.5 4.5 5
## 5 BE 3 2.5 2.5 2.5 4
## 6 BG 2 4 3.5 4.5 5
## 7 BR 4 3 2.5 3 4.5
## 8 CA 2.5 4 2.5 2.5 4
## 9 CH 3 4.5 2 4 3.5
## 10 CL 3 4 2 3 5
## # ℹ 48 more rows
Country-level averages
top_5_ext_countries <- big5_country_averages |>
arrange(desc(EXT)) |>
head(5)
ggplot(data = top_5_ext_countries, aes(x = EXT, y = country)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(x = "Extraversion Score", y = "Country", title = "Top 5 Countries by Extraversion Score") +
theme_minimal()

Joining Big 5 Country Data to COVID-19 Data
covid19_big5 <- merge(covid19, big5_country_averages, by.x = "country_abbr", by.y = "country")
head(covid19_big5)
## country_abbr Country.Region Date Id ConfirmedCases EXT EST
## 1 AE United Arab Emirates 2020-03-24 25452 248 3.5 2.5
## 2 AT Austria 2020-03-24 1644 5283 3.5 3.0
## 3 AU Australia 2020-03-24 6453 1971 3.0 4.5
## 4 BE Belgium 2020-03-24 2202 4269 3.0 2.5
## 5 CH Switzerland 2020-03-24 19035 9877 3.0 4.5
## 6 CN China 2020-03-24 150624 79199 3.0 3.5
## AGR CSN OPN
## 1 3.5 5.0 4.0
## 2 5.0 2.5 4.0
## 3 3.5 4.5 5.0
## 4 2.5 2.5 4.0
## 5 2.0 4.0 3.5
## 6 3.0 4.5 3.5
factors <- c('EXT', 'EST', 'AGR', 'CSN', 'OPN')
factor_names <- c('Extraversion', 'Emotional Stability', 'Agreeableness', 'Conscientiousness', 'Openness')
for (i in seq_along(factors)) {
# Compute the correlation coefficient
corr <- cor.test(covid19_big5[,factors[i]], covid19_big5$ConfirmedCases)
text <- paste0("r = ", round(corr$estimate, 2), ", p = ", round(corr$p.value, 2))
# Create the plot
p <- ggplot(covid19_big5, aes_string(x=factors[i], y="ConfirmedCases")) +
geom_point() +
stat_smooth(method="lm") +
labs(title=paste0("Confirmed cases at 14 days after first 50 cases \n by average score on Big 5 factors ", factor_names[i], "\n", text),
x=factor_names[i], y="Confirmed Cases")
print(p)
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

without China- atypical outlier because it was where the outbreak
started
for (i in seq_along(factors)) {
# Compute the correlation coefficient without China
corr <- cor.test(subset(covid19_big5, country_abbr != "CN")[,factors[i]],
subset(covid19_big5, country_abbr != "CN")$ConfirmedCases)
text <- paste0("r = ", round(corr$estimate, 2), ", p = ", round(corr$p.value, 2))
# Create the plot without China
p <- ggplot(subset(covid19_big5, country_abbr != "CN"), aes_string(x=factors[i], y="ConfirmedCases")) +
geom_point() +
stat_smooth(method="lm") +
labs(title=paste0("Confirmed cases at 14 days after first 50 cases \n by average score on Big 5 factors ", factor_names[i], "\n", text),
x=factor_names[i], y="Confirmed Cases")
print(p)
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

OPN
covid19_big5 |>
select(country_abbr, OPN, ConfirmedCases, Country.Region) |> # creates a new data frame
arrange(desc(OPN)) |>
inner_join(country_isos, by = c("country_abbr" = "country_abbr")) |> # merges it with another data frame
select(-c(country_abbr, Country.Region.x)) # drops columns
## OPN ConfirmedCases Country.Region.y
## 1 5.0 1971 Australia
## 2 5.0 156 France
## 3 5.0 8077 United Kingdom
## 4 5.0 536 India
## 5 4.5 1713 Denmark
## 6 4.5 39885 Spain
## 7 4.5 743 Greece
## 8 4.5 2863 Norway
## 9 4.0 248 United Arab Emirates
## 10 4.0 5283 Austria
## 11 4.0 4269 Belgium
## 12 4.0 32986 Germany
## 13 4.0 1930 Israel
## 14 4.0 1193 Japan
## 15 3.5 9877 Switzerland
## 16 3.5 79199 China
## 17 3.5 402 Egypt
## 18 3.5 2286 Sweden
## 19 3.0 792 Finland
## 20 3.0 69176 Italy
## 21 3.0 5560 Netherlands
## 22 3.0 2362 Portugal
## 23 3.0 827 Thailand
## 24 2.5 558 Singapore
## 25 2.0 1624 Malaysia
New graph
Select the columns of interest and arrange the rows by country
covid19_big5_plot <- covid19_big5 |>
select(EXT, EST, AGR, CSN, OPN, Country.Region, ConfirmedCases) |>
arrange(Country.Region)
covid19_big5_plot$ConfirmedCases <- as.numeric(covid19_big5_plot$ConfirmedCases)
mat <- as.matrix(covid19_big5_plot[, -6]) # Exclude the last column (ConfirmedCases)
rownames(mat) <- covid19_big5_plot$Country.Region
Define custom color palette
my_palette <- colorRampPalette(c("#FFFFFF", "#DEB887", "#795548"), space = "rgb")(n = 299)
Define text
#font_files <- font_files() |> tibble()
#View(font_files)
font_add(family = 'Cooper Black', regular = "C:/Windows/Fonts/COOPBL.TTF")
showtext_auto()
Create heatmap
p1 <- heatmaply(mat,
dendrogram = "none",
xlab = "", ylab = "",
main = "",
scale = "column",
margins = c(60, 100, 40, 20),
colors = my_palette,
grid_color = "white",
grid_width = 0.00001,
titleX = FALSE,
hide_colorbar = FALSE,
branches_lwd = 0.1,
label_names = c("Country", "Trait", "Zscore"),
fontsize_row = 8, fontsize_col = 8,
labCol = colnames(mat),
labRow = rownames(mat),
heatmap_layers = list(theme(axis.line=element_blank(),
text=element_text(family="Cooper Black"))),
width = 1200, height = 1000)
Convert to plotly object and add layout
p1_ggplot <- ggplotly(p1, tooltip="label")
p1_ggplot <- p1_ggplot |>
layout(title = list(text = "COVID-19 confirmed cases on 2020-03-24 and big 5 Z-scores - Country comparison\n<span style='font-size: 14px'>For more details, move your mouse over the squares</span>",
font = list(size = 20, family = "Cooper Black"), y = 4),
xaxis = list(title = "Big 5 Z-scores and COVID-19", titlefont = list(size = 16, family = "Cooper Black")),
yaxis = list(title = list(text = "Country", titlefont = list(size = 16, family = "Cooper Black"), standoff = 5, y = -5)),
margin = list(t=100))
p1_ggplot
New graph 2
Create ggplot
p2 <- ggplot(mat_melted, aes(x = Var2, y = Var1)) +
geom_point(aes(size = Confirmed_Cases, fill = value), shape = 21, color = "black") +
theme(panel.background = element_blank(), panel.border = element_rect(colour = "black", fill = NA, size = 1),
axis.text.x = element_text(size = 12, family = "Cooper Black"),
axis.text.y = element_text(size = 12, family = "Cooper Black"),
axis.title = element_text(size = 16, family = "Cooper Black"),
legend.text = element_text(size = 14, family = "Cooper Black"), # Adjust legend text size
legend.title = element_text(size = 16, family = "Cooper Black")) +
labs(x = "Big 5 Trait", y = "Country", size = "Confirmed\nCases", fill = "Value",
title = "COVID-19 Confirmed Cases on 2020-03-24 and Big 5 Scores - Country Comparison") +
theme(plot.title = element_text(size = 18, family = "Cooper Black")) +
scale_fill_gradientn(colors = c("#FFFFFF", "#DEB887", "#795548"))
print(p2)
